#################################
# COVARIATE CHECK: IGNORABILITY #
#################################
# Author: Kasia Nalewajko
# First created: 09 March 2023
# Replicated: 09 June 2024

rm(list = ls())

# LOAD PACKAGES -----------------------------------------------------------

if (!require("dplyr")) install.packages("dplyr")
if (!require("estimatr")) install.packages("estimatr")
if (!require("haven")) install.packages("haven")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("stringr")) install.packages("stringr")
if (!require("sf")) install.packages("sf")

# LOAD DATA ---------------------------------------------------------------

load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/main.Rda")
data_1872 <- read_dta("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/census1872small.dta")
more_1872 <- read_dta("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/canton_dataset.dta")
load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/communes_deppct.Rda")
cantonsxy <- read_dta("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/cantonsxy.dta")
cantons_sf <- sf::read_sf("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/Gay_CANTONS_1940/CANTONS_1940.shp")

# CLEAN ----
# ADD SQUICCIARINI TO MAIN DATA FRAME ------

# Assign Squicciarini units to 1940 counties
# overlay the coordinates with dept codes
squi_coords <- unique(cantonsxy[, c("X", "Y")])

# create a points collection
squi_coords_sf <- do.call("st_sfc",c(lapply(1:nrow(squi_coords), # `LONG` needs to be first
                                            function(i) {sf::st_point(as.numeric(squi_coords[i, ]))}), list("crs" = 4326)))

squi_coords_trans <- sf::st_transform(squi_coords_sf, 2163) # apply transformation to coords sf
cantons_trans <- sf::st_transform(cantons_sf, 2163) # apply transformation to polygons sf

# intersect and extract departement canton code
squi_coords$deppct <- apply(st_intersects(cantons_trans, squi_coords_trans, sparse = FALSE), 2,
                            function(col) {
                              cantons_trans[which(col), ]$deppct
                            })

sum(stringr::str_count(string = squi_coords$deppct, pattern = "numeric")) # 6 missing

squi_coords$deppct <- as.numeric(squi_coords$deppct)
squi_coords <- squi_coords %>% filter(!is.na(deppct))

temp <- left_join(cantonsxy, squi_coords) %>% 
  dplyr::select(ID_canton, deppct) %>% 
  filter(!is.na(deppct))

more_1872 <- left_join(more_1872, temp) %>% 
  filter(!is.na(deppct)) %>% 
  group_by(deppct) %>% 
  summarise(HH_expend1901 = mean(HH_expend, na.rm = T),
            share_cath_schools1894 = mean(share_cath_schools1894, na.rm = T),
            schools_total1894 = mean(exp(lnschools_total1894), na.rm = T)) %>% 
  mutate_all(~ifelse(is.nan(.), NA, .))

main <- left_join(main, more_1872)

# ADD 1872 CENSUS TO MAIN DATA FRAME -------

census1872_deppct <- left_join(data_1872, communes_deppct) %>% 
  mutate(pop1872 = exp(logtotalpop1872),
         male1872 = share_male1872*pop1872,
         unemployed1872 = share_unemployment1872*pop1872,
         liberalprofs1872 = share_liberals1872*pop1872,
         trade1872 = share_merchants1872*pop1872,
         industry1872 = share_industry1872*pop1872,
         agriculture1872 = share_agriculture1872*pop1872
  ) %>% 
  group_by(deppct) %>% 
  summarise(pop1872 = sum(pop1872, na.rm = T),
            male1872 = sum(male1872, na.rm = T),
            unemployed1872 = sum(unemployed1872, na.rm = T),
            liberalprofs1872 = sum(liberalprofs1872, na.rm = T),
            trade1872 = sum(trade1872, na.rm = T),
            industry1872 = sum(industry1872, na.rm = T),
            agriculture1872 = sum(agriculture1872, na.rm = T),
            male1872_share = male1872/pop1872,
            unemployed1872_share = unemployed1872/pop1872,
            liberalprofs1872_share = liberalprofs1872/pop1872,
            trade1872_share = trade1872/pop1872,
            industry1872_share = industry1872/pop1872,
            agriculture1872_share = agriculture1872/pop1872
  ) %>% 
  dplyr::select(deppct, male1872_share, unemployed1872_share, liberalprofs1872_share, trade1872_share, industry1872_share, agriculture1872_share) %>% 
  filter(!is.na(deppct))

main <- left_join(main, census1872_deppct)

# ADD WWI BATTLES TO MAIN DATA FRAME ----

main$verdun <- ifelse(main$months_Verdun > 0, 1, NA)
main$verdun <- ifelse(main$months_Verdun == 0, 0, main$verdun)

patterns <- c("14_pct$", "1872_share$")
covariates <- grep(pattern = paste(patterns, collapse = "|"), x = names(main), value = T)
covariates <- c(covariates, "verdun", "marne", "somme", "aisne", "chemindesdames", "OutsideFrance")

# RUN THE MODELS -------

# Generate vector of outcomes
outcomes <- "mpf1000"

# Create empty containers
estimate <- rep(NA, 24*length(outcomes))
std_error <- rep(NA, 24*length(outcomes))
model <- rep(NA, 24*length(outcomes))
nobs <- rep(NA, 24*length(outcomes))
models_list <- list()
n <- length(outcomes)
term <- NULL
CI_lower <- rep(NA, 24*length(outcomes))
CI_upper <- rep(NA, 24*length(outcomes))

# Loop over outcomes
for(i in 1:length(outcomes)){
  
  # covariate 1
  models_list[[i]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ turnout_14_pct + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                        clusters = id_bureau,
                                        fixed_effects = as.factor(ar_name),
                                        data = main))
  estimate[[i]] <- models_list[[i]][["coefficients"]][1,1]
  std_error[[i]] <- models_list[[i]][["coefficients"]][1,2]
  term[[i]] <- models_list[[i]][["call"]][["formula"]][[2]][[3]]
  nobs[[i]] <- models_list[[i]][["nobs"]]
  model[[i]] <- "Electoral indicators (1914)"
  CI_lower[[i]] <- models_list[[i]][["coefficients"]][1,5]
  CI_upper[[i]] <- models_list[[i]][["coefficients"]][1,6]
  
  # covariate 2
  models_list[[i+1]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(empty_ballots_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                          clusters = id_bureau,
                                          fixed_effects = as.factor(ar_name),
                                          data = main))
  estimate[[i+1]] <- models_list[[i+1]][["coefficients"]][1,1]
  std_error[[i+1]] <- models_list[[i+1]][["coefficients"]][1,2]
  model[[i+1]] <- "Electoral indicators (1914)"
  term[[i+1]] <- models_list[[i+1]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+1]] <- models_list[[i+1]][["nobs"]]
  CI_lower[[i+1]] <- models_list[[i+1]][["coefficients"]][1,5]
  CI_upper[[i+1]] <- models_list[[i+1]][["coefficients"]][1,6]
  
  # covariate 3
  models_list[[i+2]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(SFIO_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                            clusters = id_bureau,
                                            fixed_effects = as.factor(ar_name),
                                            data = main))
  estimate[[i+2]] <- models_list[[i+2]][["coefficients"]][1,1]
  std_error[[i+2]] <- models_list[[i+2]][["coefficients"]][1,2]
  model[[i+2]] <- "Electoral indicators (1914)"
  term[[i+2]] <- models_list[[i+2]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+2]] <- models_list[[i+2]][["nobs"]]
  CI_lower[[i+2]] <- models_list[[i+2]][["coefficients"]][1,5]
  CI_upper[[i+2]] <- models_list[[i+2]][["coefficients"]][1,6]
  
  # covariate 4
  models_list[[i+3]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(REPSOC_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                              clusters = id_bureau,
                                              fixed_effects = as.factor(ar_name),
                                              data = main))
  estimate[[i+3]] <- models_list[[i+3]][["coefficients"]][1,1]
  std_error[[i+3]] <- models_list[[i+3]][["coefficients"]][1,2]
  model[[i+3]] <- "Electoral indicators (1914)"
  term[[i+3]] <- models_list[[i+3]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+3]] <- models_list[[i+3]][["nobs"]]
  CI_lower[[i+3]] <- models_list[[i+3]][["coefficients"]][1,5]
  CI_upper[[i+3]] <- models_list[[i+3]][["coefficients"]][1,6]
  
  # covariate 5
  models_list[[i+4]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(RADSOC_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                                clusters = id_bureau,
                                                fixed_effects = as.factor(ar_name),
                                                data = main))
  estimate[[i+4]] <- models_list[[i+4]][["coefficients"]][1,1]
  std_error[[i+4]] <- models_list[[i+4]][["coefficients"]][1,2]
  model[[i+4]] <- "Electoral indicators (1914)"
  term[[i+4]] <- models_list[[i+4]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+4]] <- models_list[[i+4]][["nobs"]]
  CI_lower[[i+4]] <- models_list[[i+4]][["coefficients"]][1,5]
  CI_upper[[i+4]] <- models_list[[i+4]][["coefficients"]][1,6]
  
  # covariate 6
  models_list[[i+5]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(RADIND_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                          clusters = id_bureau,
                                          fixed_effects = as.factor(ar_name),
                                          data = main))
  estimate[[i+5]] <- models_list[[i+5]][["coefficients"]][1,1]
  std_error[[i+5]] <- models_list[[i+5]][["coefficients"]][1,2]
  model[[i+5]] <- "Electoral indicators (1914)"
  term[[i+5]] <- models_list[[i+5]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+5]] <- models_list[[i+5]][["nobs"]]
  CI_lower[[i+5]] <- models_list[[i+5]][["coefficients"]][1,5]
  CI_upper[[i+5]] <- models_list[[i+5]][["coefficients"]][1,6]
  
  # covariate 7
  models_list[[i+6]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(PRDS_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                          clusters = id_bureau,
                                          fixed_effects = as.factor(ar_name),
                                          data = main))
  estimate[[i+6]] <- models_list[[i+6]][["coefficients"]][1,1]
  std_error[[i+6]] <- models_list[[i+6]][["coefficients"]][1,2]
  model[[i+6]] <- "Electoral indicators (1914)"
  term[[i+6]] <- models_list[[i+6]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+6]] <- models_list[[i+6]][["nobs"]]
  CI_lower[[i+6]] <- models_list[[i+6]][["coefficients"]][1,5]
  CI_upper[[i+6]] <- models_list[[i+6]][["coefficients"]][1,6]
  
  # covariate 8
  models_list[[i+7]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(Progressistes_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                          clusters = id_bureau,
                                          fixed_effects = as.factor(ar_name),
                                          data = main))
  estimate[[i+7]] <- models_list[[i+7]][["coefficients"]][1,1]
  std_error[[i+7]] <- models_list[[i+7]][["coefficients"]][1,2]
  model[[i+7]] <- "Electoral indicators (1914)"
  term[[i+7]] <- models_list[[i+7]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+7]] <- models_list[[i+7]][["nobs"]]
  CI_lower[[i+7]] <- models_list[[i+7]][["coefficients"]][1,5]
  CI_upper[[i+7]] <- models_list[[i+7]][["coefficients"]][1,6]
  
  # covariate 9
  models_list[[i+8]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(ALP_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                          clusters = id_bureau,
                                          fixed_effects = as.factor(ar_name),
                                          data = main))
  estimate[[i+8]] <- models_list[[i+8]][["coefficients"]][1,1]
  std_error[[i+8]] <- models_list[[i+8]][["coefficients"]][1,2]
  model[[i+8]] <- "Electoral indicators (1914)"
  term[[i+8]] <- models_list[[i+8]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+8]] <- models_list[[i+8]][["nobs"]]
  CI_lower[[i+8]] <- models_list[[i+8]][["coefficients"]][1,5]
  CI_upper[[i+8]] <- models_list[[i+8]][["coefficients"]][1,6]
  
  # covariate 10
  models_list[[i+9]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(Divers_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                          clusters = id_bureau,
                                          fixed_effects = as.factor(ar_name),
                                          data = main))
  estimate[[i+9]] <- models_list[[i+9]][["coefficients"]][1,1]
  std_error[[i+9]] <- models_list[[i+9]][["coefficients"]][1,2]
  model[[i+9]] <- "Electoral indicators (1914)"
  term[[i+9]] <- models_list[[i+9]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+9]] <- models_list[[i+9]][["nobs"]]
  CI_lower[[i+9]] <- models_list[[i+9]][["coefficients"]][1,5]
  CI_upper[[i+9]] <- models_list[[i+9]][["coefficients"]][1,6]
  
  # covariate 11
  models_list[[i+10]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ verdun + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+10]] <- models_list[[i+10]][["coefficients"]][1,1]
  std_error[[i+10]] <- models_list[[i+10]][["coefficients"]][1,2]
  model[[i+10]] <- "WWI battles (1914-18)"
  term[[i+10]] <- models_list[[i+10]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+10]] <- models_list[[i+10]][["nobs"]]
  CI_lower[[i+10]] <- models_list[[i+10]][["coefficients"]][1,5]
  CI_upper[[i+10]] <- models_list[[i+10]][["coefficients"]][1,6]
  
  # covariate 12
  models_list[[i+11]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ marne + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+11]] <- models_list[[i+11]][["coefficients"]][1,1]
  std_error[[i+11]] <- models_list[[i+11]][["coefficients"]][1,2]
  model[[i+11]] <- "WWI battles (1914-18)"
  term[[i+11]] <- models_list[[i+11]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+11]] <- models_list[[i+11]][["nobs"]]
  CI_lower[[i+11]] <- models_list[[i+11]][["coefficients"]][1,5]
  CI_upper[[i+11]] <- models_list[[i+11]][["coefficients"]][1,6]
  
  # covariate 13
  models_list[[i+12]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ somme + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+12]] <- models_list[[i+12]][["coefficients"]][1,1]
  std_error[[i+12]] <- models_list[[i+12]][["coefficients"]][1,2]
  model[[i+12]] <- "WWI battles (1914-18)"
  term[[i+12]] <- models_list[[i+12]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+12]] <- models_list[[i+12]][["nobs"]]
  CI_lower[[i+12]] <- models_list[[i+12]][["coefficients"]][1,5]
  CI_upper[[i+12]] <- models_list[[i+12]][["coefficients"]][1,6]
  
  # covariate 14
  models_list[[i+13]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ aisne + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+13]] <- models_list[[i+13]][["coefficients"]][1,1]
  std_error[[i+13]] <- models_list[[i+13]][["coefficients"]][1,2]
  model[[i+13]] <- "WWI battles (1914-18)"
  term[[i+13]] <- models_list[[i+13]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+13]] <- models_list[[i+13]][["nobs"]]
  CI_lower[[i+13]] <- models_list[[i+13]][["coefficients"]][1,5]
  CI_upper[[i+13]] <- models_list[[i+13]][["coefficients"]][1,6]
  
  # covariate 15
  models_list[[i+14]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ chemindesdames + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+14]] <- models_list[[i+14]][["coefficients"]][1,1]
  std_error[[i+14]] <- models_list[[i+14]][["coefficients"]][1,2]
  model[[i+14]] <- "WWI battles (1914-18)"
  term[[i+14]] <- models_list[[i+14]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+14]] <- models_list[[i+14]][["nobs"]]
  CI_lower[[i+14]] <- models_list[[i+14]][["coefficients"]][1,5]
  CI_upper[[i+14]] <- models_list[[i+14]][["coefficients"]][1,6]
  
  # covariate 16
  models_list[[i+15]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ OutsideFrance + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+15]] <- models_list[[i+15]][["coefficients"]][1,1]
  std_error[[i+15]] <- models_list[[i+15]][["coefficients"]][1,2]
  model[[i+15]] <- "WWI battles (1914-18)"
  term[[i+15]] <- models_list[[i+15]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+15]] <- models_list[[i+15]][["nobs"]]
  CI_lower[[i+15]] <- models_list[[i+15]][["coefficients"]][1,5]
  CI_upper[[i+15]] <- models_list[[i+15]][["coefficients"]][1,6]
  
  # covariate 17
  models_list[[i+16]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(unemployed1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+16]] <- models_list[[i+16]][["coefficients"]][1,1]
  std_error[[i+16]] <- models_list[[i+16]][["coefficients"]][1,2]
  model[[i+16]] <- "1872 census"
  term[[i+16]] <- models_list[[i+16]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+16]] <- models_list[[i+16]][["nobs"]]
  CI_lower[[i+16]] <- models_list[[i+16]][["coefficients"]][1,5]
  CI_upper[[i+16]] <- models_list[[i+16]][["coefficients"]][1,6]
  
  # covariate 18
  models_list[[i+17]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(liberalprofs1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+17]] <- models_list[[i+17]][["coefficients"]][1,1]
  std_error[[i+17]] <- models_list[[i+17]][["coefficients"]][1,2]
  model[[i+17]] <- "1872 census"
  term[[i+17]] <- models_list[[i+17]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+17]] <- models_list[[i+17]][["nobs"]]
  CI_lower[[i+17]] <- models_list[[i+17]][["coefficients"]][1,5]
  CI_upper[[i+17]] <- models_list[[i+17]][["coefficients"]][1,6]
  
  # covariate 19
  models_list[[i+18]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(trade1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+18]] <- models_list[[i+18]][["coefficients"]][1,1]
  std_error[[i+18]] <- models_list[[i+18]][["coefficients"]][1,2]
  model[[i+18]] <- "1872 census"
  term[[i+18]] <- models_list[[i+18]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+18]] <- models_list[[i+18]][["nobs"]]
  CI_lower[[i+18]] <- models_list[[i+18]][["coefficients"]][1,5]
  CI_upper[[i+18]] <- models_list[[i+18]][["coefficients"]][1,6]
  
  # covariate 20
  models_list[[i+19]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(industry1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+19]] <- models_list[[i+19]][["coefficients"]][1,1]
  std_error[[i+19]] <- models_list[[i+19]][["coefficients"]][1,2]
  model[[i+19]] <- "1872 census"
  term[[i+19]] <- models_list[[i+19]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+19]] <- models_list[[i+19]][["nobs"]]
  CI_lower[[i+19]] <- models_list[[i+19]][["coefficients"]][1,5]
  CI_upper[[i+19]] <- models_list[[i+19]][["coefficients"]][1,6]
  
  # covariate 21
  models_list[[i+20]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(agriculture1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+20]] <- models_list[[i+20]][["coefficients"]][1,1]
  std_error[[i+20]] <- models_list[[i+20]][["coefficients"]][1,2]
  model[[i+20]] <- "1872 census"
  term[[i+20]] <- models_list[[i+20]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+20]] <- models_list[[i+20]][["nobs"]]
  CI_lower[[i+20]] <- models_list[[i+20]][["coefficients"]][1,5]
  CI_upper[[i+20]] <- models_list[[i+20]][["coefficients"]][1,6]
  
  # covariate 22
  models_list[[i+21]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ HH_expend1901 + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+21]] <- models_list[[i+21]][["coefficients"]][1,1]
  std_error[[i+21]] <- models_list[[i+21]][["coefficients"]][1,2]
  model[[i+21]] <- "Other historical"
  term[[i+21]] <- models_list[[i+21]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+21]] <- models_list[[i+21]][["nobs"]]
  CI_lower[[i+21]] <- models_list[[i+21]][["coefficients"]][1,5]
  CI_upper[[i+21]] <- models_list[[i+21]][["coefficients"]][1,6]
  
  # covariate 23
  models_list[[i+22]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ log(schools_total1894+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+22]] <- models_list[[i+22]][["coefficients"]][1,1]
  std_error[[i+22]] <- models_list[[i+22]][["coefficients"]][1,2]
  model[[i+22]] <- "Other historical"
  term[[i+22]] <- models_list[[i+22]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+22]] <- models_list[[i+22]][["nobs"]]
  CI_lower[[i+22]] <- models_list[[i+22]][["coefficients"]][1,5]
  CI_upper[[i+22]] <- models_list[[i+22]][["coefficients"]][1,6]
  
  # covariate 24
  models_list[[i+23]] <- summary(lm_robust(as.formula(paste(outcomes[i],"~ share_cath_schools1894 + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq")),
                                           clusters = id_bureau,
                                           fixed_effects = as.factor(ar_name),
                                           data = main))
  estimate[[i+23]] <- models_list[[i+23]][["coefficients"]][1,1]
  std_error[[i+23]] <- models_list[[i+23]][["coefficients"]][1,2]
  model[[i+23]] <- "Other historical"
  term[[i+23]] <- models_list[[i+23]][["call"]][["formula"]][[2]][[3]]
  nobs[[i+23]] <- models_list[[i+23]][["nobs"]]
  CI_lower[[i+23]] <- models_list[[i+23]][["coefficients"]][1,5]
  CI_upper[[i+23]] <- models_list[[i+23]][["coefficients"]][1,6]
  
}

# PUT TOGETHER DATA FRAME FOR PLOTTING -------

models_df <- data.frame(
  outcome = outcomes,
  variables = as.character(term),
  estimate = estimate,
  std_error = std_error,
  model = model,
  nobs = nobs,
  CI_lower = CI_lower,
  CI_upper = CI_upper)

models_df$variables <- stringr::str_remove(string = models_df$variables, pattern = "^~ log\\(")
models_df$variables <- stringr::str_remove(string = models_df$variables, pattern = "^~ ")
models_df$variables <- stringr::str_remove(string = models_df$variables, pattern = "\\+.*$")
models_df$variables <- stringr::str_trim(string = models_df$variables, side = "both")

models_df$variables <- str_replace_all(string = models_df$variables, pattern = "empty_ballots_14_pct", replacement = "Casted empty ballots")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "SFIO_14_pct", replacement = "SFIO votes")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "REPSOC_14_pct", replacement = "PRS votes")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "RADSOC_14_pct", replacement = "Rad. Socialist votes")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "RADIND_14_pct", replacement = "Rad. Indep. votes")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "PRDS_14_pct", replacement = "PRDS votes")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "Progressistes_14_pct", replacement = "Progressistes votes")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "ALP_14_pct", replacement = "ALP votes")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "Divers_14_pct", replacement = "Other votes")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "verdun", replacement = "Fought at Verdun")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "marne", replacement = "Fought at Marne")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "somme", replacement = "Fought at Somme")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "aisne", replacement = "Fought at Aisne")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "chemindesdames", replacement = "Fought at Chemin des Dames")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "OutsideFrance", replacement = "Fought outside France")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "unemployed1872_share", replacement = "Share unemployed")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "liberalprofs1872_share", replacement = "Share in liberal occup.")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "trade1872_share", replacement = "Share in trade")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "industry1872_share", replacement = "Share in industry")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "agriculture1872_share", replacement = "Share in agricul.")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "HH_expend1901", replacement = "Avg. household expend. (1901)")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "schools_total1894", replacement = "Number schools (1894)")
models_df$variables <- str_replace_all(string = models_df$variables, pattern = "share_cath_schools1894", replacement = "Share cath. schools (1894)")

# PLOT ---------------------------------------------------------------

models_df %>%
  filter(variables != "turnout_14_pct") %>% 
  ggplot(aes(reorder(variables, -estimate), 
             estimate, 
             color = factor(model, levels = c("Electoral indicators (1914)", "WWI battles (1914-18)", "1872 census", "Other historical")), 
             shape = factor(model, levels = c("Electoral indicators (1914)", "WWI battles (1914-18)", "1872 census", "Other historical"))
  )) +
  geom_hline(aes(yintercept = 0), linetype = 2, color = "gray20") +
  geom_linerange(aes(ymin = CI_lower, ymax = CI_upper), 
                 linewidth = 1) +
  geom_point(size = 1.75
  ) +
  theme_bw() +
  theme(legend.title = element_blank(),
        legend.position="bottom",
        panel.background = element_rect(fill = NA, color = "black")
  ) +
  labs(
    # title = "Covariate balance test of WWI soldier death rates one pre-WWI variables (ignorability assumption)",
    subtitle = "",
    x = "Covariates",
    y = "Estimates"
  ) +
  scale_color_viridis_d() +
  coord_flip()

# SAVE ---------------------------------------------------------------

ggsave(
  "A5_covtest_preWW1.png",
  plot = last_plot(),
  path = "./00 SUBMITTED/00 APSR final/03 dataverse_online_appendix/figures",
  width = 20,
  height = 24,
  units = "cm",
  dpi = 300
)

